In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime

In [2]:
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class
import pandas as pd


time: 1.03 s

Clean Raw Annotations

Load raw annotations


In [3]:
"""
# v4_annotated
user_blocked = [
                'annotated_onion_layer_5_rows_0_to_5000_raters_20',     
                'annotated_onion_layer_5_rows_0_to_10000',             
                'annotated_onion_layer_5_rows_0_to_10000_raters_3',          
                'annotated_onion_layer_5_rows_10000_to_50526_raters_10',
                'annotated_onion_layer_10_rows_0_to_1000',              
                'annotated_onion_layer_20_rows_0_to_1000',              
                'annotated_onion_layer_30_rows_0_to_1000',              
]

user_random = [
            'annotated_random_data_rows_0_to_5000_raters_20',
            'annotated_random_data_rows_5000_to_10000',
            'annotated_random_data_rows_5000_to_10000_raters_3',
            'annotated_random_data_rows_10000_to_20000_raters_10',
]

article_blocked = ['article_onion_layer_5_all_rows_raters_10',]
article_random = ['article_random_data_all_rows_raters_10',]
"""

user_blocked = [
            'user_blocked',
            'user_blocked_2',
            'user_blocked_3',
            'user_blocked_4',
            'user_blocked_layer_10',
            'user_blocked_layer_20',
            'user_blocked_layer_30',
]

user_random = [
            'user_random',
            'user_random_2',
            'user_random_3',
            'user_random_4',
            'user_random_extra_baselines',

]

article_blocked = [ 'article_blocked',
                    'article_blocked_layer_5_extra_baselines' ]


article_random = ['article_random',
                  'article_random_extra_baselines']



files = {
    'user': {'blocked': user_blocked, 'random': user_random},
    'article': {'blocked': article_blocked, 'random': article_random}
}


dfs = []

for ns, d in files.items():
    for sample, files in  d.items():
        for f in files:
            df = pd.read_csv('../../data/annotations/raw/%s/%s.csv' % (ns,f))
            df['src'] = f
            df['ns'] = ns
            df['sample'] = sample
            dfs.append(df)
df = pd.concat(dfs)
print('# annotations: ', df.shape[0])


# annotations:  1524236
time: 23.1 s

Make random and blocked samples disjoint


In [4]:
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()


Out[4]:
1    124631
2        93
Name: rev_id, dtype: int64
time: 1.52 s

In [5]:
df.index = df.rev_id
df.sample_count = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()


time: 251 ms

In [6]:
df.sample_count.value_counts()


Out[6]:
1    124631
2        93
Name: rev_id, dtype: int64
time: 3.94 ms

In [7]:
# just set them all to random
df['sample'][df.sample_count == 2] = 'random'


time: 107 ms
/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [8]:
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()


Out[8]:
1    124724
Name: rev_id, dtype: int64
time: 251 ms

In [9]:
del df.sample_count


time: 2.27 ms

In [10]:
print('# annotations: ', df.shape[0])


# annotations:  1524236
time: 1.07 ms

Tidy is_harassment_or_attack column


In [11]:
df = tidy_labels(df)


time: 5.93 s

Remap aggression score


In [12]:
df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)


time: 554 ms

Remove answers to test questions


In [13]:
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])


# annotations:  1524236
time: 862 ms

Remove annotations where revision could not be read


In [14]:
# remove all annotations for a revisions where more than 50% of annotators for that revision could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])


# annotations:  1510976
time: 46.9 s

In [15]:
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])


# annotations:  1501494
time: 514 ms

Examine aggression_score or is_harassment_or_attack input


In [16]:
df['aggression_score'].value_counts(dropna=False)


Out[16]:
 0.0    1081861
-1.0     144269
 1.0      92441
-3.0      74081
-2.0      66210
 2.0      29848
 3.0      11902
NaN         882
Name: aggression_score, dtype: int64
time: 21.3 ms

In [17]:
df['is_harassment_or_attack'].value_counts(dropna=False)


Out[17]:
not_attack                                            1213696
recipient                                              150911
other                                                   40457
third_party                                             33592
recipient\nthird_party                                  10046
other\nnot_attack                                        9283
recipient\nnot_attack                                    6967
quoting                                                  6596
recipient\nthird_party\nquoting\nother\nnot_attack       5969
recipient\nother                                         4408
recipient\nthird_party\nquoting\nother                   2504
recipient\nthird_party\nnot_attack                       2496
third_party\nother                                       1906
recipient\nthird_party\nother                            1863
quoting\nnot_attack                                      1630
recipient\nthird_party\nquoting                          1606
recipient\nthird_party\nquoting\nnot_attack              1392
third_party\nnot_attack                                  1300
quoting\nother\nnot_attack                               1230
recipient\nother\nnot_attack                              830
quoting\nother                                            642
third_party\nquoting                                      610
recipient\nquoting                                        476
third_party\nquoting\nother                               361
recipient\nquoting\nnot_attack                            212
recipient\nquoting\nother                                 129
third_party\nquoting\nnot_attack                          117
third_party\nother\nnot_attack                             89
recipient\nthird_party\nother\nnot_attack                  66
NaN                                                        41
third_party\nquoting\nother\nnot_attack                    38
recipient\nquoting\nother\nnot_attack                      31
Name: is_harassment_or_attack, dtype: int64
time: 115 ms

Drop NAs in aggression_score or is_harassment_or_attack input


In [18]:
df = df.dropna(subset = ['aggression_score', 'is_harassment_or_attack'])
print('# annotations: ', df.shape[0])


# annotations:  1500571
time: 915 ms

Remove ambivalent is_harassment_or_attack annotations

An annotations is ambivalent if it was labeled as both an attack and not an attack


In [19]:
# remove all annotations from users who are ambivalent in 10% or more of revisions
# we consider these users unreliable
def ambivalent(s):
    return 'not_attack' in s and s!= 'not_attack'
df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
non_ambivalent_workers = df.groupby('_worker_id', as_index = False)['ambivalent'].mean().query('ambivalent < 0.1')
df = df.merge(non_ambivalent_workers[['_worker_id']], how = 'inner', on = '_worker_id')
print('# annotations: ', df.shape[0])


# annotations:  1439146
time: 5.58 s

In [20]:
# remove all other ambivalent annotations
df = df.query('ambivalent==False')
print('# annotations: ', df.shape[0])


# annotations:  1434257
time: 2.58 s

Make sure that each rev was only annotated by the same worker once


In [21]:
df.groupby(['rev_id', '_worker_id']).size().value_counts()


Out[21]:
1    1431503
2       1377
dtype: int64
time: 569 ms

In [22]:
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])


# annotations:  1432880
time: 1.17 s

Filter out annotations for revisions with duplicated diff content


In [23]:
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])


123633
time: 154 ms

In [24]:
u_comments = comments.drop_duplicates(subset = ['clean_diff'])
print(u_comments.shape[0])


120218
time: 195 ms

In [25]:
comments[comments.duplicated(subset = ['clean_diff'])].head(5)


Out[25]:
_aggression_score _channel _city _country _created_at _golden _id _ip _is_harassment_or_attack _missed ... user_id user_text not_attack other quoting recipient third_party attack aggression ambivalent
825 NaN neodev Belgrade SRB 4/20/2016 14:37:26 False 1965035223 109.92.158.251 NaN NaN ... 20335199.0 Linkiscool99 1.0 0.0 0.0 0.0 0.0 0.0 0.0 False
1316 NaN clixsense Rio De Janeiro BRA 5/25/2016 17:40:23 False 1999580751 186.221.107.247 NaN NaN ... 9897.0 Kwekubo 1.0 0.0 0.0 0.0 0.0 0.0 0.0 False
1551 NaN clixsense Rio De Janeiro BRA 5/8/2016 13:47:38 False 1979100431 186.221.148.47 NaN NaN ... 11496785.0 M-m-moot 1.0 0.0 0.0 0.0 0.0 0.0 0.0 False
3103 NaN neodev Cairo EGY 4/21/2016 10:51:20 False 1965872978 197.44.120.129 NaN NaN ... 10928492.0 Horse Manure Again 0.0 1.0 0.0 0.0 0.0 1.0 1.0 False
3754 NaN neodev Belgrade SRB 5/8/2016 14:57:10 False 1979194169 77.46.214.221 NaN NaN ... 16328760.0 DavisJune 1.0 0.0 0.0 0.0 0.0 0.0 0.0 False

5 rows × 52 columns

time: 90.6 ms

In [26]:
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])


# annotations:  1395983
time: 2.96 s

Check that labels are not None


In [27]:
df['recipient'].value_counts(dropna=False)


Out[27]:
0.0    1240903
1.0     155080
Name: recipient, dtype: int64
time: 18.8 ms

In [28]:
df['attack'].value_counts(dropna=False)


Out[28]:
0.0    1163956
1.0     232027
Name: attack, dtype: int64
time: 14.2 ms

In [29]:
df['aggression'].value_counts(dropna=False)


Out[29]:
0.0    1141434
1.0     254549
Name: aggression, dtype: int64
time: 15.3 ms

Remove annotations from all revisions that were annotated less than 8 times


In [30]:
counts = df['rev_id'].value_counts().to_frame()
counts.columns = ['n']
counts['rev_id'] = counts.index


time: 41.2 ms

In [31]:
counts.shape


Out[31]:
(120218, 2)
time: 1.82 ms

In [32]:
counts['n'].value_counts().head()


Out[32]:
10    56283
9     29208
8      7469
19     6907
20     6190
Name: n, dtype: int64
time: 3.79 ms

In [33]:
counts_enough = counts.query("n>=8")


time: 11.2 ms

In [34]:
counts_enough.shape


Out[34]:
(116179, 2)
time: 2.07 ms

In [35]:
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])


# annotations:  1368958
time: 1.05 s

Discard nuisance columns


In [50]:
df.columns


Out[50]:
Index(['_aggression_score', '_channel', '_city', '_country', '_created_at',
       '_golden', '_id', '_ip', '_is_harassment_or_attack', '_missed', '_na',
       '_region', '_started_at', '_tainted', '_trust', '_unit_id',
       '_worker_id', 'aggression_score', 'aggression_score_gold',
       'aggression_score_gold_reason', 'block_actions', 'block_params',
       'block_reasons', 'block_timestamps', 'clean_diff', 'diff',
       'insert_only', 'is_harassment_or_attack',
       'is_harassment_or_attack_gold', 'is_harassment_or_attack_gold_reason',
       'na', 'na_gold', 'na_gold_reason', 'ns', 'orig__golden', 'page_id',
       'page_title', 'rev_comment', 'rev_id', 'rev_timestamp', 'sample', 'src',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression', 'ambivalent'],
      dtype='object')
time: 2.36 ms

In [36]:
cols = ['rev_id', '_worker_id', 'ns', 'sample', 'src','clean_diff', 'diff', 'insert_only', 'page_id',
       'page_title', 'rev_comment', 'rev_timestamp', 
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression', 'aggression_score']
df = df[cols]


time: 1.59 s

Summary Stats


In [41]:
df.groupby(['ns', 'sample']).size()


Out[41]:
ns       sample 
article  blocked    351106
         random     233073
user     blocked    534054
         random     250725
dtype: int64
time: 359 ms

In [42]:
df.to_csv('../../data/annotations/clean/annotations.tsv', index=False, sep='\t')


time: 44.6 s

In [43]:
pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t').shape


Out[43]:
(1368958, 22)
time: 14.8 s